################################################
# Analysis for:
#
# Jordan, Soren, Hannah Paul, and Andrew Q. Philips.
#
# Required files: bypass_data_LSQ.dta
#
################################################


library(rfUtilities)
library(readstata13)
library(plm)
library(dplyr)
library(randomForest)
library(caret)
library(foreign)
library(glmnet)
library(e1071)
library(ggplot2)
library(neuralnet)
library(ranger)
library(sandwich)
library(lmtest)
library(zoo)
library(AER)
library(car)
library(carData)
library(survival)
library(Formula)
library(sfsmisc)
library(haven)
library(pdp)
library(ICEbox) # this package makes us lump all our X vars into an object
library(rdd)
library(ALEPlot)
library(scales)
library(ggpubr)

data.ho <- read.dta13("/Users/scj0014/Myfiles/Dropbox/Interpreting ML Models/LSQ research note/Example 2/bypass_data_LSQ.dta")

head(data.ho)

######################################################
# Explicit dummy versions of congress
######################################################

data.ho$cong99 <- ifelse(data.ho$cong == 99, 1, 0)
data.ho$cong99_fac <- data.ho$cong99
data.ho$cong99_fac <- factor(data.ho$cong99_fac)
table(data.ho$cong99_fac)
is.factor(data.ho$cong99_fac)

data.ho$cong100 <- ifelse(data.ho$cong == 100, 1, 0)
data.ho$cong100_fac <- data.ho$cong100
data.ho$cong100_fac <- factor(data.ho$cong100_fac)
table(data.ho$cong100_fac)
is.factor(data.ho$cong100_fac)

data.ho$cong101 <- ifelse(data.ho$cong == 101, 1, 0)
data.ho$cong101_fac <- data.ho$cong101
data.ho$cong101_fac <- factor(data.ho$cong101_fac)
table(data.ho$cong101_fac)
is.factor(data.ho$cong101_fac)

data.ho$cong102 <- ifelse(data.ho$cong == 102, 1, 0)
data.ho$cong102_fac <- data.ho$cong102
data.ho$cong102_fac <- factor(data.ho$cong102_fac)
table(data.ho$cong102_fac)
is.factor(data.ho$cong102_fac)

data.ho$cong103 <- ifelse(data.ho$cong == 103, 1, 0)
data.ho$cong103_fac <- data.ho$cong103
data.ho$cong103_fac <- factor(data.ho$cong103_fac)
table(data.ho$cong103_fac)
is.factor(data.ho$cong103_fac)

data.ho$cong104 <- ifelse(data.ho$cong == 104, 1, 0)
data.ho$cong104_fac <- data.ho$cong104
data.ho$cong104_fac <- factor(data.ho$cong104_fac)
table(data.ho$cong104_fac)
is.factor(data.ho$cong104_fac)

data.ho$cong105 <- ifelse(data.ho$cong == 105, 1, 0)
data.ho$cong105_fac <- data.ho$cong105
data.ho$cong105_fac <- factor(data.ho$cong105_fac)
table(data.ho$cong105_fac)
is.factor(data.ho$cong105_fac)

data.ho$cong106 <- ifelse(data.ho$cong == 106, 1, 0)
data.ho$cong106_fac <- data.ho$cong106
data.ho$cong106_fac <- factor(data.ho$cong106_fac)
table(data.ho$cong106_fac)
is.factor(data.ho$cong106_fac)

data.ho$cong107 <- ifelse(data.ho$cong == 107, 1, 0)
data.ho$cong107_fac <- data.ho$cong107
data.ho$cong107_fac <- factor(data.ho$cong107_fac)
table(data.ho$cong107_fac)
is.factor(data.ho$cong107_fac)

data.ho$cong108 <- ifelse(data.ho$cong == 108, 1, 0)
data.ho$cong108_fac <- data.ho$cong108
data.ho$cong108_fac <- factor(data.ho$cong108_fac)
table(data.ho$cong108_fac)
is.factor(data.ho$cong108_fac)

data.ho$cong109 <- ifelse(data.ho$cong == 109, 1, 0)
data.ho$cong109_fac <- data.ho$cong109
data.ho$cong109_fac <- factor(data.ho$cong109_fac)
table(data.ho$cong109_fac)
is.factor(data.ho$cong109_fac)

data.ho$cong110 <- ifelse(data.ho$cong == 110, 1, 0)
data.ho$cong110_fac <- data.ho$cong110
data.ho$cong110_fac <- factor(data.ho$cong110_fac)
table(data.ho$cong110_fac)
is.factor(data.ho$cong110_fac)

data.ho$cong111 <- ifelse(data.ho$cong == 111, 1, 0)
data.ho$cong111_fac <- data.ho$cong111
data.ho$cong111_fac <- factor(data.ho$cong111_fac)
table(data.ho$cong111_fac)
is.factor(data.ho$cong111_fac)

data.ho$cong112 <- ifelse(data.ho$cong == 112, 1, 0)
data.ho$cong112_fac <- data.ho$cong112
data.ho$cong112_fac <- factor(data.ho$cong112_fac)
table(data.ho$cong112_fac)
is.factor(data.ho$cong112_fac)

data.ho$cong113 <- ifelse(data.ho$cong == 113, 1, 0)
data.ho$cong113_fac <- data.ho$cong113
data.ho$cong113_fac <- factor(data.ho$cong113_fac)
table(data.ho$cong113_fac)
is.factor(data.ho$cong113_fac)


######################################################
# Explicit dummy versions of policy area
######################################################

data.ho$major1 <- ifelse(data.ho$major == 1, 1, 0)
data.ho$major1_fac <- data.ho$major1
data.ho$major1_fac <- factor(data.ho$major1_fac)
table(data.ho$major1_fac)
is.factor(data.ho$major1_fac)

data.ho$major2 <- ifelse(data.ho$major == 2, 1, 0)
data.ho$major2_fac <- data.ho$major2
data.ho$major2_fac <- factor(data.ho$major2_fac)
table(data.ho$major2_fac)
is.factor(data.ho$major2_fac)

data.ho$major3 <- ifelse(data.ho$major == 3, 1, 0)
data.ho$major3_fac <- data.ho$major3
data.ho$major3_fac <- factor(data.ho$major3_fac)
table(data.ho$major3_fac)
is.factor(data.ho$major3_fac)

data.ho$major4 <- ifelse(data.ho$major == 4, 1, 0)
data.ho$major4_fac <- data.ho$major4
data.ho$major4_fac <- factor(data.ho$major4_fac)
table(data.ho$major4_fac)
is.factor(data.ho$major4_fac)

data.ho$major5 <- ifelse(data.ho$major == 5, 1, 0)
data.ho$major5_fac <- data.ho$major5
data.ho$major5_fac <- factor(data.ho$major5_fac)
table(data.ho$major5_fac)
is.factor(data.ho$major5_fac)

data.ho$major6 <- ifelse(data.ho$major == 6, 1, 0)
data.ho$major6_fac <- data.ho$major6
data.ho$major6_fac <- factor(data.ho$major6_fac)
table(data.ho$major6_fac)
is.factor(data.ho$major6_fac)

data.ho$major7 <- ifelse(data.ho$major == 7, 1, 0)
data.ho$major7_fac <- data.ho$major7
data.ho$major7_fac <- factor(data.ho$major7_fac)
table(data.ho$major7_fac)
is.factor(data.ho$major7_fac)

data.ho$major8 <- ifelse(data.ho$major == 8, 1, 0)
data.ho$major8_fac <- data.ho$major8
data.ho$major8_fac <- factor(data.ho$major8_fac)
table(data.ho$major8_fac)
is.factor(data.ho$major8_fac)

data.ho$major9 <- ifelse(data.ho$major == 9, 1, 0)
data.ho$major9_fac <- data.ho$major9
data.ho$major9_fac <- factor(data.ho$major9_fac)
table(data.ho$major9_fac)
is.factor(data.ho$major9_fac)

data.ho$major10 <- ifelse(data.ho$major == 10, 1, 0)
data.ho$major10_fac <- data.ho$major10
data.ho$major10_fac <- factor(data.ho$major10_fac)
table(data.ho$major10_fac)
is.factor(data.ho$major10_fac)

data.ho$major11 <- ifelse(data.ho$major == 11, 1, 0)
data.ho$major11_fac <- data.ho$major11
data.ho$major11_fac <- factor(data.ho$major11_fac)
table(data.ho$major11_fac)
is.factor(data.ho$major11_fac)

data.ho$major12 <- ifelse(data.ho$major == 12, 1, 0)
data.ho$major12_fac <- data.ho$major12
data.ho$major12_fac <- factor(data.ho$major12_fac)
table(data.ho$major12_fac)
is.factor(data.ho$major12_fac)

data.ho$major13 <- ifelse(data.ho$major == 13, 1, 0)
data.ho$major13_fac <- data.ho$major13
data.ho$major13_fac <- factor(data.ho$major13_fac)
table(data.ho$major13_fac)
is.factor(data.ho$major13_fac)

data.ho$major14 <- ifelse(data.ho$major == 14, 1, 0)
data.ho$major14_fac <- data.ho$major14
data.ho$major14_fac <- factor(data.ho$major14_fac)
table(data.ho$major14_fac)
is.factor(data.ho$major14_fac)

data.ho$major15 <- ifelse(data.ho$major == 15, 1, 0)
data.ho$major15_fac <- data.ho$major15
data.ho$major15_fac <- factor(data.ho$major15_fac)
table(data.ho$major15_fac)
is.factor(data.ho$major15_fac)

data.ho$major16 <- ifelse(data.ho$major == 16, 1, 0)
data.ho$major16_fac <- data.ho$major16
data.ho$major16_fac <- factor(data.ho$major16_fac)
table(data.ho$major16_fac)
is.factor(data.ho$major16_fac)

data.ho$major17 <- ifelse(data.ho$major == 17, 1, 0)
data.ho$major17_fac <- data.ho$major17
data.ho$major17_fac <- factor(data.ho$major17_fac)
table(data.ho$major17_fac)
is.factor(data.ho$major17_fac)

data.ho$major18 <- ifelse(data.ho$major == 18, 1, 0)
data.ho$major18_fac <- data.ho$major18
data.ho$major18_fac <- factor(data.ho$major18_fac)
table(data.ho$major18_fac)
is.factor(data.ho$major18_fac)

data.ho$major19 <- ifelse(data.ho$major == 19, 1, 0)
data.ho$major19_fac <- data.ho$major19
data.ho$major19_fac <- factor(data.ho$major19_fac)
table(data.ho$major19_fac)
is.factor(data.ho$major19_fac)

data.ho$major20 <- ifelse(data.ho$major == 20, 1, 0)
data.ho$major20_fac <- data.ho$major20
data.ho$major20_fac <- factor(data.ho$major20_fac)
table(data.ho$major20_fac)
is.factor(data.ho$major20_fac)

data.ho$major21 <- ifelse(data.ho$major == 21, 1, 0)
data.ho$major21_fac <- data.ho$major21
data.ho$major21_fac <- factor(data.ho$major21_fac)
table(data.ho$major21_fac)
is.factor(data.ho$major21_fac)

data.ho$major99 <- ifelse(data.ho$major == 99, 1, 0)
data.ho$major99_fac <- data.ho$major99
data.ho$major99_fac <- factor(data.ho$major99_fac)
table(data.ho$major99_fac)
is.factor(data.ho$major99_fac)

######################################################
# Explicit dummy versiosn of everything else
######################################################

data.ho$bypass_fac <- factor(data.ho$bypass)
table(data.ho$bypass_fac)
is.factor(data.ho$bypass_fac)

data.ho$minority_fac <- factor(data.ho$minority)
table(data.ho$minority_fac)
is.factor(data.ho$minority_fac)

data.ho$comc_fac <- factor(data.ho$comc)
table(data.ho$comc_fac)
is.factor(data.ho$comc_fac)

data.ho$floor_leader_fac <- factor(data.ho$floor_leader)
table(data.ho$floor_leader_fac)
is.factor(data.ho$floor_leader_fac)

data.ho$up_for_election_fac <- factor(data.ho$up_for_election)
table(data.ho$up_for_election_fac)
is.factor(data.ho$up_for_election_fac)

data.ho$duplicate_variable_fac <- factor(data.ho$duplicate_variable)
table(data.ho$duplicate_variable_fac)
is.factor(data.ho$duplicate_variable_fac)

data.ho$impbill_fac <- factor(data.ho$impbill)
table(data.ho$impbill_fac)
is.factor(data.ho$impbill_fac)

data.ho$partybill_fac <- factor(data.ho$partybill)
table(data.ho$partybill_fac)
is.factor(data.ho$partybill_fac)

######################################################
# Strict Manuscript of original model. 
#  Table 2, Dichotomous model, p. 509
######################################################

model.replicate <- glm(bypass ~ extremity100_scaled + minority + extremity100_scaled:minority + 
	comc + floor_leader + up_for_election + duplicate_variable + months_remaining + 
	bills_introduced + sen_polarization100 + spons_years + cosponsr2 + 
	minority:cosponsr2 + impbill + partybill + 
	cong100 + cong101 + cong102 + cong103 + cong104 + cong105 +
	cong106 + cong107 + cong108 + cong109 + cong110 + cong111 + cong112 +
	cong113 + major2 + major3 + major4 + major5 + major6 + major7 +
	major8 + major9 + major10 + major12 + major13 + major14 + major15 +
	major16 + major17 + major18 + major19 + major20 + major21 + major99, 
	data = data.ho, family = binomial(link = "logit"))

model.replicate.cluster <- coeftest(model.replicate, vcov = vcovCL, cluster = ~pooleid)

cbind(round(model.replicate.cluster[,1], digits = 3), "&", 
	paste0("(", round(model.replicate.cluster[,2], digits = 3), ")"), "\\")

# Use the factorized versions just to make sure it's all the smae
model.replicate.fac <- glm(bypass_fac ~ extremity100_scaled + minority_fac + extremity100_scaled:minority_fac + 
	comc_fac + floor_leader_fac + up_for_election_fac + duplicate_variable_fac + months_remaining + 
	bills_introduced + sen_polarization100 + spons_years + cosponsr2 + 
	minority_fac:cosponsr2 + impbill_fac + partybill_fac +
	cong100_fac + cong101_fac + cong102_fac + cong103_fac + cong104_fac + cong105_fac +
	cong106_fac + cong107_fac + cong108_fac + cong109_fac + cong110_fac + cong111_fac + cong112_fac +
	cong113_fac + major2_fac + major3_fac + major4_fac + major5_fac + major6_fac + major7_fac +
	major8_fac + major9_fac + major10_fac + major12_fac + major13_fac + major14_fac + major15_fac +
	major16_fac + major17_fac + major18_fac + major19_fac + major20_fac + major21_fac + major99_fac, 
	data = data.ho, family = binomial(link = "logit"))

summary(model.replicate.fac)

# Stripping the names, are they the same?
identical(cbind(coef(model.replicate), coef(model.replicate.fac))[,1], 
				cbind(coef(model.replicate), coef(model.replicate.fac))[,2])



###############################################################################
######################## ML Time. Random forest models ########################
###############################################################################
#########################################################
######################## Estimate #######################
#########################################################
####################################################
# Estimate (no interactions, non-linear ML model!) #
####################################################

# Take the coefficients from the factor model, exluding the interactions
ho.rf.X.nointaxn <- attr(model.replicate.fac$terms, which = "term.labels")[!(attr(model.replicate.fac$terms, which = "term.labels") %in% 
						c("extremity100_scaled:minority_fac", "minority_fac:cosponsr2"))]

# Frame the data, including the bypass DV
ho.rf.data.nointaxn <- data.ho[names(data.ho) %in% c(ho.rf.X.nointaxn, "bypass")]

# Remove missing observations
ho.rf.data.nona.nointaxn <- na.omit(ho.rf.data.nointaxn)

# Set hyperparameters using ranger. You MUST create this.
makeHyper <- function(min.mtry, max.mtry, mtry.increment, 
					min.node, max.node, node.increment,
					min.tree, max.tree, tree.increment) {
	combinations <- expand.grid(mtry = seq(min.mtry, max.mtry, mtry.increment),
													node.size = seq(min.node, max.node, node.increment),
													tree.size = seq(min.tree, max.tree, tree.increment),
													OOB.RMSE = NA)
	combinations$comb <- 1:dim(combinations)[1]
	combinations
}

hyperparameter.grid.HO <- makeHyper(min.mtry = 2, max.mtry = 12, mtry.increment = 2,
									min.node = 2, max.node = 11, node.increment = 2,
									min.tree = 200, max.tree = 2000, tree.increment = 100)

# Choose one: if you're running a new model, you'd want to run this. It's searching over the parameter
#  combinations and calculating the RMSE for each. We'll end up using the minimum RMSE
#  So normally, we'd 
# [1] Run the grid search, which might take a while, and pull the parameters from there.
#
# set.seed(27)
# for(i in 1:nrow(hyperparameter.grid.HO)) {
# 	the.model <- ranger(formula = as.factor(bypass) ~ ., 
#						data = ho.rf.data.nona.nointaxn,
#						num.trees = hyperparameter.grid.HO$tree.size[i],
#						mtry = hyperparameter.grid.HO$mtry[i],
#						min.node.size = hyperparameter.grid.HO$node.size[i])
#	hyperparameter.grid.HO$OOB.RMSE[i] <- sqrt(the.model$prediction.error)
# }
#
# And then set the combination with the minimum RMSE.
#
# hyperparameter.grid.HO[which(hyperparameter.grid.HO$OOB.RMSE == min(hyperparameter.grid.HO$OOB.RMSE)), "comb"]
# Best fit with min 2 node size
#     mtry node.size tree.size OOB.RMSE comb
# 522   12         4      1900 0.290814  522
#
# Assign it as the combination to use.
# the.combo <- hyperparameter.grid.HO[which(hyperparameter.grid.HO$OOB.RMSE == min(hyperparameter.grid.HO$OOB.RMSE)), "comb"]
#
# Since this takes a long time, though, we're going to use the previous results.
# [2] Rely on previous results to pull the relevant combination.
 the.combo <- 522

set.seed(120)
# Random forest (also takes a long time)
full.rf.ho.nointaxn <- randomForest(as.factor(bypass)  ~ ., 
							data = ho.rf.data.nona.nointaxn, 
							ntree = hyperparameter.grid.HO[hyperparameter.grid.HO$comb == the.combo,]$tree.size, 
							mtry = hyperparameter.grid.HO[hyperparameter.grid.HO$comb == the.combo,]$mtry,
							nodesize = hyperparameter.grid.HO[hyperparameter.grid.HO$comb == the.combo,]$node.size,
							importance = TRUE)
							

#########################################################
########################## VIP ##########################
#########################################################
#########################
# VIP (single variable) #
#########################
# How to make a VIP.
# [1] Save the variable names and importance measures from the ML model
#     We will call this saved data ``ho.X.vars.nointaxn.vip.dat''
ho.X.vars.nointaxn.vip.dat <- data.frame(names = rownames(full.rf.ho.nointaxn$importance), 
								MDA = full.rf.ho.nointaxn$importance[,1]/full.rf.ho.nointaxn$importanceSD)

# [2] Add the mean decrease in accuracy measure to ``ho.X.vars.nointaxn.vip.dat'' as explicitly numeric
ho.X.vars.nointaxn.vip.dat$MDA <- as.numeric(ho.X.vars.nointaxn.vip.dat$MDA.MeanDecreaseAccuracy)

# [3] What do we want to plot and draw attention to?
#     [a] All variables. We're going to change the variable names for prettier plotting
ho.X.vars.nointaxn.vip.dat$Variable <- c("Polarization", "Cosponsors", "Bills Introduced", "Sponsor Seniority", "Time Remaining",
							"Extremity", paste0(100:113, " Congress"), paste0("Major Area ", c(2:10, 12:21, 99)),
							"Minority Sponsor", "Committee Chair", "Floor Leader", "Up for Election", "Duplicate Bill",
							"Nontrivial Bill", "Party Bill")

#     [b] Ignore the fixed effects. We can also force ggplot to ignore the factor variables in plotting by passing a blank label
ho.X.vars.nointaxn.vip.dat$Variable2 <- c("Polarization", "Cosponsors", "Bills Introduced", "Sponsor Seniority", "Time Remaining",
							"Extremity", rep(" ", length(100:113)), rep(" ", length(c(2:10, 12:21, 99))),
							"Minority Sponsor", "Committee Chair", "Floor Leader", "Up for Election", "Duplicate Bill",
							"Nontrivial Bill", "Party Bill")

#     [c] Color the key variables. Indicate, by variable name, which variables to change colors
ho.X.vars.nointaxn.vip.dat$ho.key.var <- "no"
ho.X.vars.nointaxn.vip.dat$ho.key.var[ho.X.vars.nointaxn.vip.dat$Variable == "Extremity"] <- "yes"
ho.X.vars.nointaxn.vip.dat$ho.key.var[ho.X.vars.nointaxn.vip.dat$Variable == "Minority Sponsor"] <- "yes"
ho.X.vars.nointaxn.vip.dat$ho.key.var[ho.X.vars.nointaxn.vip.dat$Variable == "Committee Chair"] <- "yes"


# [4] Make an ordered version of importance, sorted by the value of the VIP metric
ho.X.vars.nointaxn.vip.dat.sort <- ho.X.vars.nointaxn.vip.dat %>% arrange(-MDA)

# [5] Pass to ggplot as normal. This version ignores the fixed effects and colors by key variable (yes/no in scale_fill_manual)
#     We're passing MDA: the MeanDecreaseAccuracy created above
#     (If we wanted to plot all the variable names, we would not give it the scale_x_discrete line: that
#     gives it Variable2 [the no FE version] as the variable names)
ho.X.vars.nointaxn.vip.plot <- ggplot(ho.X.vars.nointaxn.vip.dat.sort, aes(x = reorder(Variable, -MDA), 
									y = MDA, fill = ho.key.var)) +
								geom_bar(stat = "identity") + 
								theme_minimal() + 
								theme(axis.text.x = element_text(angle=75, hjust = 1), legend.position = "none") +
								scale_fill_manual(values = c("yes" = "black", "no" = "grey50")) + 
								scale_x_discrete(labels = ho.X.vars.nointaxn.vip.dat.sort$Variable2) +
								xlab("") +
								ylab("Mean Decrease Accuracy")

# pdf("...")
ho.X.vars.nointaxn.vip.plot
 dev.off()




#########################################################
########################## PDP ##########################
#########################################################
#########################
# PDP (single variable) #
#########################

# How to make a PDP. Start with extremity
# [1] Define the prediction function. Since this is a classification problem (the outcome is bypass or ~bypass),
#     the prediction function will be type 'prob'. 
ho.X.vars.nointaxn.pdppred <- function(object, newdata) {
	predict(full.rf.ho.nointaxn, newdata, type = 'prob')[, 2]
}

# [2] Extract the pdp using the partial() function on the ML model. 
#     Change pred.var to the model variable we're interested in the PDP for
#     The pred.fun argument is the prediction function defined above
the.pdp.ext <- partial(full.rf.ho.nointaxn, pred.var = "extremity100_scaled", pred.fun = ho.X.vars.nointaxn.pdppred)

# [3] Pass to ggplot as normal. Notice, we can add a rug by passing the PDP through stat_summary
ext.pdp <- ggplot() + 
					stat_summary(data = the.pdp.ext, aes(x = extremity100_scaled, y = yhat), 
						fun = mean, geom = "line", col = "black", size = 2) +
					theme_minimal() +
          coord_cartesian(ylim=c(0.08, 0.22)) +         
          xlab("Extremity") +
					ylab("Predicted Value") + 
					geom_rug(data = ho.rf.data.nona.nointaxn, aes(x = extremity100_scaled))

# Figure 3, left side (a)
# pdf("...")
ext.pdp
 dev.off()


# [4] We could write another prediction function that gathers the mean and standard deviation
#     in each direction: https://bgreenwell.github.io/pdp/articles/pdp-se.Rmd.html (the author of pdp)
ho.X.vars.nointaxn.pdppred.sd <- function(object, newdata) {
	preds <- predict(object, newdata, type = 'prob')[, 2]
	c("mean" = mean(preds), "lower" = mean(preds) - sd(preds), "upper" = mean(preds) + sd(preds))
}

# And notice we're replicating the same code, with the new function
the.pdp.ext.sd <- partial(object = full.rf.ho.nointaxn, 
							pred.var = "extremity100_scaled", 
							pred.fun = ho.X.vars.nointaxn.pdppred.sd)

# Spread the data (alternative to tidyverse)
the.pdp.ext.sd.wide <- data.frame(extremity100_scaled = the.pdp.ext.sd$extremity100_scaled[the.pdp.ext.sd$yhat.id == "mean"],
									lower = the.pdp.ext.sd$yhat[the.pdp.ext.sd$yhat.id == "lower"],
									mean = the.pdp.ext.sd$yhat[the.pdp.ext.sd$yhat.id == "mean"],
									upper = the.pdp.ext.sd$yhat[the.pdp.ext.sd$yhat.id == "upper"])

ext.pdp.sd <- ggplot(data = the.pdp.ext.sd.wide, aes(x = extremity100_scaled, y = mean)) + 
					geom_line(lwd = 2) +
					geom_ribbon(aes(ymin = lower, ymax = upper, alpha = 0.2, linetype = "dashed")) +
					theme_minimal() +
					theme(legend.position = "none") +
					xlab("Extremity") +
					ylab("Predicted Value")

# Figure 3, right side (b)
# pdf("...")
ext.pdp.sd
 dev.off()


# Repeat for polarization
the.pdp.polr <- partial(full.rf.ho.nointaxn, pred.var = "sen_polarization100", pred.fun = ho.X.vars.nointaxn.pdppred)

polr.pdp <- ggplot() + 
					stat_summary(data = the.pdp.polr, aes(x = sen_polarization100, y = yhat), 
						fun = mean, geom = "line", col = "black", size = 2) +
					theme_minimal() +
          coord_cartesian(ylim = c(0.05, 0.25)) +         
					xlab("Polarization") +
					ylab("Predicted Value") + 
					geom_rug(data = ho.rf.data.nona.nointaxn, aes(x = sen_polarization100))

# Figure 5
# pdf("...")
polr.pdp
 dev.off()

 
 
# the.pdp.polr.sd <- partial(object = full.rf.ho.nointaxn, 
#							pred.var = "sen_polarization100", 
#							pred.fun = ho.X.vars.nointaxn.pdppred.sd)

# the.pdp.polr.sd.wide <- data.frame(sen_polarization100 = the.pdp.polr.sd$sen_polarization100[the.pdp.polr.sd$yhat.id == "mean"],
#									lower = the.pdp.polr.sd$yhat[the.pdp.polr.sd$yhat.id == "lower"],
#									mean = the.pdp.polr.sd$yhat[the.pdp.polr.sd$yhat.id == "mean"],
#									upper = the.pdp.polr.sd$yhat[the.pdp.polr.sd$yhat.id == "upper"])

# polr.pdp.sd <- ggplot(data = the.pdp.polr.sd.wide, aes(x = sen_polarization100, y = mean)) + 
#					geom_line(lwd = 2) +
#					geom_ribbon(aes(ymin = lower, ymax = upper, alpha = 0.2, linetype = "dashed")) +
#					theme_minimal() +
#					theme(legend.position = "none") +
#					xlab("Polarization") +
#					ylab("Predicted Value")

# Figure 5, old (b)
# pdf("...")
# polr.pdp.sd
# dev.off()





#####################
# PDP (interaction) #
#####################
# [1] If we want to plot a PDP with an interaction, we just pass multiple variables to partial()
#     Notice, the prediction function is the same
#     Here, the predictors are continuous and catetorical
the.pdp.ext.min <- partial(full.rf.ho.nointaxn, 
					pred.var = c("extremity100_scaled", "minority_fac"), 
					pred.fun = ho.X.vars.nointaxn.pdppred, chull = TRUE)

# [2] The partial() object contains fits by value, so we will aggregate them to make a single point prediction
#     This time we will use tidyverse: grouping by the two values we're making predictions on
the.pdp.ext.min.group <- the.pdp.ext.min  %>% group_by(extremity100_scaled, minority_fac) %>%
						summarise(yhat = mean(yhat))

# Add a group label back for plotting
the.pdp.ext.min.group$Minority <- ifelse(the.pdp.ext.min.group$minority_fac == 1, "Minority Member", "Majority Member")

ext.min.pdp <- ggplot(the.pdp.ext.min.group) +				
						geom_line(aes(x = extremity100_scaled, y = yhat, lty = Minority), lwd = 1) +
						theme_minimal() + 
						xlab("Extremity") + 
						ylab("Predicted Value") + 
						ylim(c(0.08, 0.22)) + 
						theme(legend.position = c(0.85, 0.1), legend.background = element_rect(fill = "white")) 

# Figure 4, left side (a)
# pdf("...")
ext.min.pdp
 dev.off()


# Repeat for extremity and committee
the.pdp.ext.com <- partial(full.rf.ho.nointaxn, 
					pred.var = c("extremity100_scaled", "comc_fac"), 
					pred.fun = ho.X.vars.nointaxn.pdppred, chull = TRUE)

# Mean to plot by group
the.pdp.ext.com.group <- the.pdp.ext.com  %>% group_by(extremity100_scaled, comc_fac) %>%
						summarise(yhat = mean(yhat))
		
# Add a group label back for plotting
the.pdp.ext.com.group$Committee <- ifelse(the.pdp.ext.com.group$comc_fac == 1, "Committee Chair", "Not Committee Chair")

ext.com.plot <- ggplot(the.pdp.ext.com.group) +					
						geom_line(aes(x = extremity100_scaled, y = yhat, lty = Committee), lwd = 1) +
						theme_minimal() + 
						xlab("Extremity") + 
						ylab("Predicted Value") + 
						ylim(c(0.08, 0.22)) + 
						theme(legend.position = c(0.85, 0.1), legend.background = element_rect(fill = "white")) 

# Figure 4, right side (b)
# pdf("...")
ext.com.plot
 dev.off()


 

# Repeat for extremity and polarization
# Notice, the two predictors are continuious. This will affect plotting, but not much else
the.pdp.ext.polr <- partial(full.rf.ho.nointaxn, 
					pred.var = c("extremity100_scaled", "sen_polarization100"), 
					pred.fun = ho.X.vars.nointaxn.pdppred, chull = TRUE)

# Notice the grouping code is the same
the.pdp.ext.polr.group <- the.pdp.ext.polr  %>% group_by(extremity100_scaled, sen_polarization100) %>%
						summarise(yhat = mean(yhat))

# Notice the plotting is different: we're using the tile aesthetic
ext.polr.pdp <- ggplot(the.pdp.ext.polr.group) +
						geom_tile(aes(x = extremity100_scaled, y = sen_polarization100, fill = yhat)) +						
						scale_fill_gradientn("Predicted Value", colours=c("grey10", "grey90")) +															theme_minimal() + 
						xlab("Extremity") + 
						ylab("Polarization") 
						# ylim(c(0.08, 0.22)) + 
						# theme(legend.position = c(0.85, 0.1), legend.background = element_rect(fill = "white")) 

# Figure 6, bottom (c)
# pdf("...")
ext.polr.pdp
 dev.off()



# Interaction: polarization and minority. The pattern: partial(), group, then plot
the.pdp.polr.min <- partial(full.rf.ho.nointaxn, 
					pred.var = c("sen_polarization100", "minority_fac"), 
					pred.fun = ho.X.vars.nointaxn.pdppred, chull = TRUE)

the.pdp.polr.min.group <- the.pdp.polr.min %>% group_by(sen_polarization100, minority_fac) %>%
						summarise(yhat = mean(yhat))

# Grouping label
the.pdp.polr.min.group$Minority <- ifelse(the.pdp.polr.min.group$minority_fac == 1, "Minority Member", "Majority Member")

polr.min.pdp <- ggplot(the.pdp.polr.min.group) +
						geom_line(aes(x = sen_polarization100, y = yhat, lty = Minority), lwd = 1) +
						theme_minimal() + 
						xlab("Polarization") + 
						ylab("Predicted Value") + 
						ylim(c(0.05, 0.25)) + 
						theme(legend.position = c(0.85, 0.9), legend.background = element_rect(fill = "white")) 

# Figure 6, left side (a)
# pdf("...")
polr.min.pdp
 dev.off()

 

# Interaction: polarization and committee.
the.pdp.polr.com <- partial(full.rf.ho.nointaxn, 
					pred.var = c("sen_polarization100", "comc_fac"), 
					pred.fun = ho.X.vars.nointaxn.pdppred, chull = TRUE)
					
the.pdp.polr.com.group <- the.pdp.polr.com %>% group_by(sen_polarization100, comc_fac) %>%
						summarise(yhat = mean(yhat))

# Grouping label
the.pdp.polr.com.group$Committee <- ifelse(the.pdp.polr.com.group$comc_fac == 1, "Committee Chair", "Not Committee Chair")

polr.com.pdp <- ggplot(the.pdp.polr.com.group) +
						geom_line(aes(x = sen_polarization100, y = yhat, lty = Committee), lwd = 1) +	
						theme_minimal() + 
						xlab("Polarization") + 
						ylab("Predicted Value") + 
						ylim(c(0.05, 0.25)) + 
						theme(legend.position = c(0.85, 0.9), legend.background = element_rect(fill = "white")) 

# Figure 6, right side (b)
# pdf("...")
polr.com.pdp
 dev.off()







#########################################################
########################## ICE ##########################
#########################################################
#########################
# ICE (single variable) #
#########################


# If you found the SD addition to the PDP unsatisfying, ICE is the answer for you! We should plot
#  the indidividual conditional lines, not the SD group

# [1] If we want to plot an ICE, we have to pay some attention to the number of lines. Here, the number
#     of y.hats is
length(unique(the.pdp.ext$yhat.id))

# 480006. So we will sample 500 lines to draw

set.seed(77281)
lines.to.draw <- 500

the.ext.yhats <- sample(1:max(the.pdp.ext$yhat.id), lines.to.draw, replace = FALSE)
the.pdp.ext.small <- the.pdp.ext[the.pdp.ext$yhat.id %in% the.ext.yhats,]

# Compare the two approaches: what a too-dense plot looks like, versus the sampled.
ext.ice <- ggplot() + 
					geom_line(data = the.pdp.ext, aes(x = extremity100_scaled, y = yhat, group = yhat.id), alpha = 0.2) +
					stat_summary(data = the.pdp.ext, aes(x = extremity100_scaled, y = yhat), 
									fun = mean, geom = "line", col = "black", size = 2) +
					theme_minimal() +
					xlab("Extremity") +
					ylab("Predicted Value") + 
					geom_rug(data = ho.rf.data.nona.nointaxn, aes(x = extremity100_scaled))

ext.ice.small <- ggplot() + 
					geom_line(data = the.pdp.ext.small, aes(x = extremity100_scaled, y = yhat, group = yhat.id), alpha = 0.2) +
					stat_summary(data = the.pdp.ext.small, aes(x = extremity100_scaled, y = yhat), 
									fun = mean, geom = "line", col = "black", size = 2) +
					theme_minimal() +
					xlab("Extremity") +
					ylab("Predicted Value") + 
					geom_rug(data = ho.rf.data.nona.nointaxn, aes(x = extremity100_scaled))

# Figure 7, left side (a)
# pdf("...")
ext.ice
 dev.off()


 
# Figure 7, right side (b)
# pdf("...")
ext.ice.small
 dev.off()





